suppressPackageStartupMessages({
library(SingleCellExperiment)
library(ggplot2)
library(patchwork)
library(caret)
library(UpSetR)
})
theme_set(theme_bw())
module_base <- rprojroot::find_root(rprojroot::is_renv_project)
data_dir <- file.path(module_base, "scratch", "benchmark-datasets")
result_dir <- file.path(module_base, "results", "benchmark-results")
plot_pca_calls <- function(df,
color_column,
pred_column,
color_lab) {
# Plot PCs colored by singlet/doublet, showing doublets on top
# df is expected to contain columns PC1, PC2, `color_column`, and `pred_column`. These should _not_ be provided as strings
ggplot(df) +
aes(x = PC1,
y = PC2,
color = {{color_column}}) +
geom_point(
size = 0.75,
alpha = 0.6
) +
scale_color_manual(name = color_lab, values = c("black", "lightblue")) +
geom_point(
data = dplyr::filter(df, {{color_column}} == "doublet"),
color = "black",
size = 0.75
) +
theme(
legend.title.position = "top",
legend.position = "bottom"
)
}
plot_pca_metrics <- function(df, color_column, false_colors) {
# Plot PCs colored by performance metric, showing false calls on top
# false_colors is a vector of colors used for fn and fp points
# df is expected to contain columns PC1, PC2, and `color_column`. This should _not_ be provided as a string.
ggplot(df) +
aes(x = PC1,
y = PC2,
color = {{color_column}}) +
geom_point(
size = 0.75,
alpha = 0.6
) +
geom_point(
data = dplyr::filter(df, {{color_column}} %in% false_colors),
size = 0.75
) +
scale_color_identity() +
theme(legend.position = "none")
}
First, we’ll read in and combine doublet results into a list of data frames for each dataset. We’ll also two columns for each dataset:
consensus_call, which will be “doublet” if all
methods predict “doublet,” and “singlet” otherwisecall_type, which will classify the consensus call as
one of “tp”, “tn”, “fp”, or “fn” (true/false positive/negative)# find all dataset names to process:
dataset_names <- list.files(result_dir, pattern = "*_scrublet.tsv") |>
stringr::str_remove("_scrublet.tsv")
# used in PCA plots
confusion_colors <- c(
"tp" = "lightblue",
"tn" = "pink",
"fp" = "blue",
"fn" = "firebrick2"
)
# Read in and data for analysis
doublet_df_list <- dataset_names |>
purrr::map(
\(dataset) {
scdbl_tsv <- file.path(result_dir, glue::glue("{dataset}_scdblfinder.tsv"))
scrub_tsv <- file.path(result_dir, glue::glue("{dataset}_scrublet.tsv"))
sce_file <- file.path(data_dir, dataset, glue::glue("{dataset}_sce.rds"))
scdbl_df <- scdbl_tsv |>
readr::read_tsv(show_col_types = FALSE) |>
dplyr::select(
barcodes,
cxds_score,
scdbl_score = score,
scdbl_prediction = class
) |>
# add cxds calls at 0.75 threshold
dplyr::mutate(
cxds_prediction = dplyr::if_else(
cxds_score >= 0.75,
"doublet",
"singlet"
)
)
scrub_df <- readr::read_tsv(scrub_tsv, show_col_types = FALSE)
# grab ground truth and PCA coordinates
sce <- readr::read_rds(sce_file)
sce_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
tibble::rownames_to_column(var = "barcodes") |>
dplyr::select(barcodes,
ground_truth = ground_truth_doublets,
PC1 = PCA.1,
PC2 = PCA.2)
rm(sce)
dataset_df <- scdbl_df |>
dplyr::left_join(
scrub_df,
by = "barcodes"
) |>
dplyr::left_join(
sce_df,
by = "barcodes"
)
# Add a consensus call column
dataset_df <- dataset_df |>
dplyr::rowwise() |>
dplyr::mutate(consensus_call = dplyr::if_else(
all(
c(scdbl_prediction, scrublet_prediction, cxds_prediction) == "doublet"
),
"doublet",
"singlet"
)) |>
dplyr::mutate(
call_type = dplyr::case_when(
consensus_call == "doublet" && ground_truth == "doublet" ~ "tp",
consensus_call == "singlet" && ground_truth == "singlet" ~ "tn",
consensus_call == "doublet" && ground_truth == "singlet" ~ "fp",
consensus_call == "singlet" && ground_truth == "doublet" ~ "fn"
),
# set associated plotting colors
call_type_color = dplyr::case_when(
call_type == "tp" ~ unname(confusion_colors["tp"]),
call_type == "tn" ~ unname(confusion_colors["tn"]),
call_type == "fp" ~ unname(confusion_colors["fp"]),
call_type == "fn" ~ unname(confusion_colors["fn"])
)
)
return(dataset_df)
}
)
names(doublet_df_list) <- dataset_names
This section shows performance metrics for the consensus calls for each dataset.
doublet_df_list |>
purrr::iwalk(
\(df, dataset) {
print(glue::glue("======================== {dataset} ========================"))
cat("Table of consensus calls:")
print(table(df$consensus_call))
cat("\n\n")
caret::confusionMatrix(
# truth should be first
table(
"Truth" = df$ground_truth,
"Consensus prediction" = df$consensus_call
),
positive = "doublet"
) |>
print()
}
)
======================== hm-6k ========================
Table of consensus calls:
doublet singlet
62 6744
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 62 109
singlet 0 6635
Accuracy : 0.984
95% CI : (0.9807, 0.9868)
No Information Rate : 0.9909
P-Value [Acc > NIR] : 1
Kappa : 0.5258
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.00000
Specificity : 0.98384
Pos Pred Value : 0.36257
Neg Pred Value : 1.00000
Prevalence : 0.00911
Detection Rate : 0.00911
Detection Prevalence : 0.02512
Balanced Accuracy : 0.99192
'Positive' Class : doublet
======================== HMEC-orig-MULTI ========================
Table of consensus calls:
doublet singlet
247 26179
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 204 3364
singlet 43 22815
Accuracy : 0.8711
95% CI : (0.867, 0.8751)
No Information Rate : 0.9907
P-Value [Acc > NIR] : 1
Kappa : 0.0911
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.825911
Specificity : 0.871500
Pos Pred Value : 0.057175
Neg Pred Value : 0.998119
Prevalence : 0.009347
Detection Rate : 0.007720
Detection Prevalence : 0.135019
Balanced Accuracy : 0.848705
'Positive' Class : doublet
======================== pbmc-1B-dm ========================
Table of consensus calls:
doublet singlet
13 3777
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 8 122
singlet 5 3655
Accuracy : 0.9665
95% CI : (0.9603, 0.972)
No Information Rate : 0.9966
P-Value [Acc > NIR] : 1
Kappa : 0.1063
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.615385
Specificity : 0.967699
Pos Pred Value : 0.061538
Neg Pred Value : 0.998634
Prevalence : 0.003430
Detection Rate : 0.002111
Detection Prevalence : 0.034301
Balanced Accuracy : 0.791542
'Positive' Class : doublet
======================== pdx-MULTI ========================
Table of consensus calls:
doublet singlet
4 10292
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 3 1314
singlet 1 8978
Accuracy : 0.8723
95% CI : (0.8657, 0.8787)
No Information Rate : 0.9996
P-Value [Acc > NIR] : 1
Kappa : 0.0038
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7500000
Specificity : 0.8723280
Pos Pred Value : 0.0022779
Neg Pred Value : 0.9998886
Prevalence : 0.0003885
Detection Rate : 0.0002914
Detection Prevalence : 0.1279138
Balanced Accuracy : 0.8111640
'Positive' Class : doublet
This section plots the PCA for each dataset, with three color schemes from left to right: - Ground truth doublets are shown in black - Consensus doublets are shown in black - Points are colored as tp, tn, fp, or fn based on comparing the consensus call to the ground truth
# Make a legend for the confusion-colored PCA
legend_plot <- data.frame(
x = factor(names(confusion_colors), levels = names(confusion_colors)), y = 1:4
) |>
ggplot(aes(x = x, y = y, color = x)) +
geom_point(size = 3) +
scale_color_manual(name = "Metric", values = confusion_colors)
confusion_legend <- ggpubr::get_legend(legend_plot) |> ggpubr::as_ggplot()
doublet_df_list |>
purrr::iwalk(
\(df, dataset) {
# First, ground truth
p1 <- plot_pca_calls(
df,
color_column = ground_truth,
color_lab = "Ground truth"
)
# Second, consensus call
p2 <- plot_pca_calls(
df,
color_column = consensus_call,
color_lab = "Consensus call"
)
# Third, call type
p3 <- plot_pca_metrics(
df,
call_type_color,
false_colors = unname(c(confusion_colors["fn"], confusion_colors["fp"]))
)
# combine and plot
plot( p1 + p2 + p3 + confusion_legend + plot_annotation(glue::glue("PCA for {dataset}")) + plot_layout(ncol=4, widths = c(1,1,1,0.25)) )
}
)
This section shows upset plots for overlap among methods for each dataset.
pull_barcodes <- function(df, pred_var) {
# Helper function to pull out barcodes for doublet calls
df$barcodes[df[[pred_var]] == "doublet"]
}
upset_list <- doublet_df_list |>
purrr::iwalk(
\(df, dataset) {
doublet_barcodes <- list(
"scDblFinder" = pull_barcodes(df, "scdbl_prediction"),
"scrublet" = pull_barcodes(df, "scrublet_prediction"),
"cxds" = pull_barcodes(df, "cxds_prediction")
)
UpSetR::upset(fromList(doublet_barcodes), order.by = "freq") |> print()
grid::grid.text(dataset,x = 0.65, y=0.95, gp=grid::gpar(fontsize=16)) # plot title
}
)
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods base
other attached packages:
[1] UpSetR_1.4.0 caret_6.0-94 lattice_0.22-6 patchwork_1.2.0
[5] ggplot2_3.5.1 SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0 Biobase_2.64.0
[9] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0 IRanges_2.38.0 S4Vectors_0.42.0
[13] BiocGenerics_0.50.0 MatrixGenerics_1.16.0 matrixStats_1.3.0
loaded via a namespace (and not attached):
[1] pROC_1.18.5 gridExtra_2.3 rlang_1.1.3 magrittr_2.0.3 e1071_1.7-14
[6] compiler_4.4.0 DelayedMatrixStats_1.26.0 vctrs_0.6.5 reshape2_1.4.4 stringr_1.5.1
[11] pkgconfig_2.0.3 crayon_1.5.2 backports_1.4.1 XVector_0.44.0 labeling_0.4.3
[16] scuttle_1.14.0 utf8_1.2.4 prodlim_2023.08.28 tzdb_0.4.0 UCSC.utils_1.0.0
[21] bit_4.0.5 purrr_1.0.2 xfun_0.43 beachmat_2.20.0 zlibbioc_1.50.0
[26] jsonlite_1.8.8 recipes_1.0.10 DelayedArray_0.30.1 BiocParallel_1.38.0 broom_1.0.5
[31] parallel_4.4.0 R6_2.5.1 stringi_1.8.4 car_3.1-2 parallelly_1.37.1
[36] rpart_4.1.23 lubridate_1.9.3 Rcpp_1.0.12 iterators_1.0.14 knitr_1.46
[41] future.apply_1.11.2 readr_2.1.5 Matrix_1.7-0 splines_4.4.0 nnet_7.3-19
[46] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.16.0 abind_1.4-5 yaml_2.3.8
[51] timeDate_4032.109 codetools_0.2-20 listenv_0.9.1 tibble_3.2.1 plyr_1.8.9
[56] withr_3.0.0 future_1.33.2 survival_3.5-8 proxy_0.4-27 ggpubr_0.6.0
[61] pillar_1.9.0 BiocManager_1.30.23 carData_3.0-5 renv_1.0.7 foreach_1.5.2
[66] generics_0.1.3 vroom_1.6.5 rprojroot_2.0.4 hms_1.1.3 sparseMatrixStats_1.16.0
[71] munsell_0.5.1 scales_1.3.0 globals_0.16.3 class_7.3-22 glue_1.7.0
[76] tools_4.4.0 data.table_1.15.4 ggsignif_0.6.4 ModelMetrics_1.2.2.2 gower_1.0.1
[81] cowplot_1.1.3 grid_4.4.0 tidyr_1.3.1 ipred_0.9-14 colorspace_2.1-0
[86] nlme_3.1-164 GenomeInfoDbData_1.2.12 cli_3.6.2 fansi_1.0.6 S4Arrays_1.4.0
[91] lava_1.8.0 dplyr_1.1.4 gtable_0.3.5 rstatix_0.7.2 digest_0.6.35
[96] SparseArray_1.4.3 farver_2.1.1 lifecycle_1.0.4 hardhat_1.3.1 httr_1.4.7
[101] bit64_4.0.5 MASS_7.3-60.2